{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/dictionary/dialect/kedah.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the Kedah dialect CSV from the working directory (the commented-out\n",
    "# wget command in the first cell downloads a file with this name).\n",
    "kedah = pd.read_csv('kedah.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import malaya\n",
    "\n",
    "# Reference vocabulary read from a private attribute of `malaya`. It is later\n",
    "# subtracted from a set of candidate words, so it is presumably a set of\n",
    "# standard Malay words -- TODO confirm against the installed malaya version.\n",
    "malays = malaya.texts._malay_words._malay_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from unidecode import unidecode\n",
    "\n",
    "def cleaning(string):\n",
    "    \"\"\"Reduce raw text to lowercase ASCII words separated by single spaces.\n",
    "\n",
    "    Args:\n",
    "        string: text to normalise (handed to ``unidecode`` first).\n",
    "\n",
    "    Returns:\n",
    "        The transliterated, lowercased text in which only letters,\n",
    "        apostrophes, double quotes, hyphens and single spaces remain.\n",
    "    \"\"\"\n",
    "    # Transliterate to ASCII and pad '.' / ',' with spaces.\n",
    "    string = unidecode(string).replace('.', '. ').replace(',', ' , ')\n",
    "    # Raw-string pattern, so the backslash before '-' is a regex escape and\n",
    "    # not an invalid Python string escape. Every run of characters outside\n",
    "    # the allowed set becomes one space.\n",
    "    string = re.sub(r'[^\\'\"A-Za-z\\- ]+', ' ', string)\n",
    "    # Lowercase, collapse repeated spaces, trim both ends.\n",
    "    string = re.sub(r'[ ]+', ' ', string.lower()).strip()\n",
    "    return string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect every cleaned token found in column '0' of the Kedah CSV.\n",
    "words = []\n",
    "for text in kedah['0']:\n",
    "    # Skip missing / non-text cells explicitly. A bare `except: pass` here\n",
    "    # would also hide a wrong column name or a bug inside cleaning().\n",
    "    if not isinstance(text, str):\n",
    "        continue\n",
    "    words.extend(cleaning(text).split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://www.facebook.com/jjcmkedah/posts/195-perkataan-loghat-kedah-yang-biasa-digunakan-dalam-perbualan-harian-pakat-pak/810404492407378/\n",
    "\n",
    "additional = \"\"\"\n",
    "1 qhe-ngau.................cakar\n",
    "2 se-qheyat.................reda/surut\n",
    "3 hambat.................kejar\n",
    "4 pe-qhembang.................bidik (aim)\n",
    "5 contiang.................lukis\n",
    "6 ligan.................kejar\n",
    "7 cemuih.................bosan/jemu\n",
    "8 tokak.................gigit\n",
    "9 gey-qhek.................basikal\n",
    "10 gon.................cerun (bukit kecil)\n",
    "11 belemoih.................comot/kotor\n",
    "12 tetomoih.................tersungkor\n",
    "13 ku-tey.................cubit\n",
    "14 tak dan.................tak sempat\n",
    "15 hebiaq.................celupar\n",
    "16 maneh melecaih.................tersangat manis\n",
    "17 Lincun/lencun.................basah\n",
    "18 lokoih.................kuyup\n",
    "19 kawaq.................pencuri\n",
    "20 kanyaq.................mentah (berhingus)\n",
    "21 ketegaq.................degil\n",
    "22 he-qhot petot.................senget benget\n",
    "23 me-qhela.................gatal\n",
    "24 log laq.................cemekap/cuai\n",
    "25 hawing.................pusing\n",
    "26 punggai.................baling/lempar\n",
    "27 tengalong.................tuju/baling\n",
    "28 me-qhe-kah.................Kupas/koyak\n",
    "29 ayaq achaq.................air longkang\n",
    "30 ha-qhin.................hancing\n",
    "31 kelepiaq.................melibas\n",
    "32 kelolo.................\"tak berapa betoi\"\n",
    "33 hemoi.................\"muka tak malu\"\n",
    "34 me-qhe-nyeh.................menggatal\n",
    "35 da-oh.................tak \"matching\"\n",
    "36 hapak pera'e.................bau hapak\n",
    "37 hijau me-qhe-ngak.................telalu \"hijau\"\n",
    "38 celuih.................muat\n",
    "39 me-qhelit me-qhelap.................berkilau-kilau\n",
    "40 me-qhe-yup.................jauh/sayup\n",
    "41 qhem-pang.................\"tergedik-gedik\"\n",
    "42 kampoi.................kumpul/kaut\n",
    "43 je-qhang.................rebus\n",
    "44 capoi.................celupar\n",
    "45 te-qhe-bey.................lastik\n",
    "46 melo-qhoih.................lucut/gelongsor\n",
    "47 me-qho-not.................membongkok\n",
    "48 sene-qheh.................selisih\n",
    "49 be-qha-chaq.................comot/kotor\n",
    "50 kahaq.................nakal\n",
    "51 pedo.................hati-hati\n",
    "52 peysos.................urus\n",
    "53 hapuih.................tenggelam (lemas)\n",
    "54 qhan-dok.................redah\n",
    "55 se-ghoh.................kebah/kurus\n",
    "56 qhen-jong.................tinggi lampai\n",
    "57 pe-qho-ngoih.................panas baran\n",
    "58 takek.................ketuk/potong\n",
    "59 be-qhemba.................bersaing/lumba\n",
    "60 qhe-tiaq.................ikhtiar/fikir\n",
    "61 ge-qhet.................gigit\n",
    "62 tahaq.................tahan\n",
    "63 qha-gaih.................panjat\n",
    "64 melewaq.................merayau\n",
    "65 melahaq.................kebulur\n",
    "66 qha-bat.................naik/panjat\n",
    "67 qhim-bok.................belasah\n",
    "68 melilau.................jalan-jalan\n",
    "69 cuak.................seram\n",
    "70 juih.................mulut muncung\n",
    "71 luku.................ketuk kepala\n",
    "72 qhodong.................melulu\n",
    "73 qhedah.................redah\n",
    "74 ayaq seqhebat.................air sirap\n",
    "75 peqhasat.................sangat dengki\n",
    "76 mandom.................lembap\n",
    "77 ceqhoi.................sangat cair\n",
    "78 tepok.................lumpuh\n",
    "79 qha-bit.................terkoyak\n",
    "80 kok-ko.................dukung\n",
    "81 mai.................mari\n",
    "82 terple'ot.................terseliuh\n",
    "83 gomoi.................bergaduh/bergusti\n",
    "84 ketit.................cubit\n",
    "85 ga-qhet.................gatal/mengada-ada\n",
    "86 nala.................tersangat\n",
    "87 mang-kark.................buah tak masak\n",
    "88 bergenuak.................berduyun-duyun\n",
    "89 pediah.................makan\n",
    "90 polok.................makan penuh-penuh\n",
    "91 tey-liang.................senget\n",
    "92 qhe-ling.................jeling\n",
    "93 peng-kaq.................kaki tempang\n",
    "94 me-qhe-nyut.................serabut\n",
    "95 kesot.................alih\n",
    "96 lanyau.................bersihkan\n",
    "97 cempong.................angkat/dukung\n",
    "98 cepung.................kepung\n",
    "99 conek.................ekor\n",
    "100 depa.................mereka\n",
    "101 qha-let.................leka/cuai\n",
    "102 cey-mey.................mata bengkak\n",
    "103 bakup.................mata layu\n",
    "104 katok.................ketuk/pukul\n",
    "105 buah ce-qhe-mai.................buah kat kedah!\n",
    "106 buah qhe-yang-dot.................buah kat kampung2\n",
    "107 ikan ka-qhen.................ikan laga\n",
    "108 bijik qhe-mia.................warna keunguan\n",
    "109 ikan temenung.................ikan kembong\n",
    "110 sendai.................ketatkan/kemaskan\n",
    "111 ketuat.................kutil\n",
    "112 mengacom.................mengadu\n",
    "113 mengkelan.................tersangkut makanan\n",
    "114 kueh dangai.................kueh dari kelapa parut\n",
    "115 kueh ka-qhas.................kueh macam \"bihun\"\n",
    "116 mot.................bawak/muat\n",
    "117 en-jos.................exzos\n",
    "118 pedai ayam.................kunci mangga/padlock\n",
    "119 pre-wel.................sprocket/freewheel\n",
    "120 pe-daq.................rasa macam \"colgate\"\n",
    "121 to-qheh.................turis/calar\n",
    "122 tungging buyung.................\"somersault\"\n",
    "123 ponen.................pondan/lembut\n",
    "124 Kesot.................Injak\n",
    "125 Qhamaih.................Ramas\n",
    "126 Pulun.................Bersungguh-sunguh\n",
    "127 Kedeqhat.................Kudrat\n",
    "128 Biaq.................Biar\n",
    "129 Rabat.................Panjat\n",
    "130 Ragaih.................Panjat\n",
    "131 Ghamei.................Suka/gembira\n",
    "132 Terebey.................Lastik\n",
    "133 Cewi.................Belagak dgn barang baru beli\n",
    "134 Belengaih.................Kotor\n",
    "135 Contiang.................Lukis\n",
    "136 Mengkala.................Kalau\n",
    "137 Katik.................Kecik\n",
    "138 Celapak.................Dating\n",
    "139 Se-gheyau.................Takut-takut\n",
    "140 Mengketedarah.................Makan\n",
    "141 Melahaq.................Makan\n",
    "142 Hampa.................Mereka\n",
    "143 Depa.................Mereka\n",
    "144 Merenyam.................Gatal (playgirl)\n",
    "145 Tak dan.................Tak sempat\n",
    "146 Nyeh.................Buang hingus\n",
    "147 Ghe-nyeh.................Senyum nampak gigi\n",
    "148 Singkiaq.................Singkir\n",
    "149 Keleboq.................Paya\n",
    "150 Pulun.................Sungguh-sungguh\n",
    "151 Lagu mana.................Macam mana\n",
    "152 Pok kai.................Duit dah abis\n",
    "153 Terejai.................Cakap secara kasar\n",
    "154 Hawaq.................Penyamun\n",
    "155 Ghe-nyeh.................Senyum terlebih\n",
    "156 Kuih.................Buang\n",
    "157 Kelepiaq.................Tangan bergerak ke sana sini\n",
    "158 Lau.................Reban ayam\n",
    "159 Toslet.................Lampu suluh\n",
    "160 Pelekoh.................Pukul\n",
    "161 Luku.................Pukul dengan guna tangan\n",
    "162 Ber-ghetoh.................Berlaga\n",
    "163 Merengit.................Hitam\n",
    "164 Kahaq.................Kasar\n",
    "165 Cekeding.................Kurus\n",
    "166 Lit-lit.................Panas\n",
    "167 Tere.................Pandai\n",
    "168 Hampa.................Mereka\n",
    "169 Ligan.................Kejar\n",
    "170 Gelema.................Kahak\n",
    "171 Herot petot.................Bengkang bengkok\n",
    "172 Je gheluih.................Timbus\n",
    "173 Melahaq.................Makan\n",
    "174 Pengkaq.................Jalan tak betul\n",
    "176 Puruih.................Lidi\n",
    "177 Tembun.................Gemuk\n",
    "178 Cemuih.................Bosan\n",
    "179 Cenge.................Garang\n",
    "180 Perenguih.................Ganas\n",
    "181 Juruih.................Sirim air\n",
    "182 Pelaq.................Kurang ajaq\n",
    "183 Kurang ajaq.................Pelaq\n",
    "184 Hemoi.................Tak kisah\n",
    "185 Lokoih.................Basah\n",
    "186 koi-koi.................Pelan-pelan\n",
    "187 Baloq liat.................Malas\n",
    "188 Terpe gheluih.................Jatuh\n",
    "189 Terpelohong.................Terbukak\n",
    "190 Tak kelapaq.................Tak ingin\n",
    "191 Tergelinciaq.................Tergelincir\n",
    "192 Koroi.................Kuruih\n",
    "193 Koro.................Sorang-sorang\n",
    "194 Kelepiaq.................Tangan bergerak kesana sini\n",
    "195 Gabuih.................Cuci\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each non-empty line reads '<number> <dialect word(s)>.....<meaning>'.\n",
    "for line in additional.split('\\n'):\n",
    "    if not line.strip():\n",
    "        continue\n",
    "    # Cut at the first run of dots instead of one exact-length literal, so a\n",
    "    # line with a different number of dots cannot leak its meaning into the\n",
    "    # word list.\n",
    "    headword = re.split(r'\\.{2,}', line, maxsplit=1)[0]\n",
    "    # '/' separates alternative spellings (e.g. 'Lincun/lencun'). Split before\n",
    "    # cleaning, because cleaning() turns '/' into a space and would fuse the\n",
    "    # alternatives into a single phrase.\n",
    "    for variant in headword.split('/'):\n",
    "        variant = cleaning(variant)\n",
    "        if variant:\n",
    "            words.append(variant)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extra hand-entered dialect words, each listed once.\n",
    "words.extend(['kekwat', 'pasai pa', 'nalanya', 'nala',\n",
    "             'menyebai', 'tertomoih', 'serlum', 'tak dan', 'ayaq nyok',\n",
    "             'hawin', 'tauk', 'sigung', 'handuh', 'ketumbit', 'terai',\n",
    "             'punggai', 'melengung', 'beghetuh', 'seneghih', 'calaq',\n",
    "             'seluloi', 'ngenjot', 'kebuloq', 'tibai', 'menyoronot',\n",
    "             'benyai', 'kelepiaq', 'ghabat', 'megheben', 'kawaq',\n",
    "             'lumpoq', 'keleboq', 'segheyat', 'cedoq', 'simboq'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "additional = \"\"\"\n",
    "air = ayaq\n",
    "dahaga air = lapaq ayaq\n",
    "beritahu/bagi tau = habaq\n",
    "basikal = beskat gheghek\n",
    "cuai/clumsy = cemerkap\n",
    "tidur = tidoq\n",
    "saya = Chek\n",
    "awak = hangpa\n",
    "kamu = hang\n",
    "mereka = depa\n",
    "tension = sakit pala\n",
    "kicap=toyu\n",
    "pondan=darai\n",
    "air kumbah= ayaq acaq\n",
    "pukul=tibai\n",
    "gila/tak betui = loqlaq\n",
    "eksyen/megah = ciwi\n",
    "kelam kabut = kalut\n",
    "conteng = contiang\n",
    "sawah = bendang\n",
    "Gemuk = Topui\n",
    "manja=merap\n",
    "ceruk=lachuk\n",
    "bosan=cemuih\n",
    "tukul=merton\n",
    "pencuri = kawaq\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Entries read '<standard> = <dialect>'; keep the text after the last '='.\n",
    "words.extend(\n",
    "    cleaning(entry.rpartition('=')[2])\n",
    "    for entry in additional.split('\\n')\n",
    "    if entry\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "additional=\"\"\"\n",
    "Aci ligan - main hambat-hambat/kejaq-kejaq aka kejar-mengejar\n",
    "Ayaq acaq - ayaq lumpuq/lecah/becak aka ayaq longkang\n",
    "Awang - panggey kat budak2 (ni awang hg mai sait!!)\n",
    "Awat - pasai pa/ada apa aka kenapa\n",
    "Bakiaq - dalam botoi susu bayi yg dah basi (lapisan terapung kat atas)\n",
    "Baloq liat - pemalaih tahap cipan\n",
    "Balun - tabuh aka pukui (pukul)/belasah\n",
    "Bambu - pukui (hang jangan dok wat keraih kepala,stg naya aku bambu hg)\n",
    "Batas - bataih aka jalan\n",
    "Beghemba - bersaing (budak2 nek beskal ramai2)\n",
    "Belah - dekat cth hg ada belah mana? belah bole guna tuk lari buang lah!\n",
    "Belahak - sendawa... Alhamdulillah\n",
    "Belemoih - kotoq/kusut masai/tak terurus\n",
    "Belen - makanan yg berlebihan p tukaq dari balance / kalau yg tak elok dpa panggey untuk bohsia,gadis peleseran\n",
    "Belengaih - pengotoq (kotor)\n",
    "Belutin - kotoq \n",
    "Berderemen - muka calaq balaq (lepas jatuh)\n",
    "Beretuh / Beghetuh - terlanggaq\n",
    "Besaq gabai - seperti besaq gajah / besaq gila\n",
    "Bok - tilam (Kedah)\n",
    "Bon - polis/mata-mata\n",
    "Borak - cakap besaq atau pun pondan\n",
    "Benyai - lembik berayaq (cth:isi durian)\n",
    "Biruu - almari (Kedah)\n",
    "Cempeqha/cempera - perangai tak tentu hala/jakun/wat sempoi\n",
    "Cabai - cili\n",
    "Camca - sudu\n",
    "Camdek/candek - mengutuk / perli\n",
    "cheq - gelaran bagi saya\n",
    "Cekeding - kurus kering\n",
    "Celuih - cukup,muat\n",
    "Cemerkap - tak cekap\n",
    "Cengeh - garang\n",
    "Ceroi - ayaq yg cayaq\n",
    "Chemuih - bosan/jemu/meluat\n",
    "Chewi - perasaan sayang terhadap sesuatu yg tk bertempat\n",
    "Chombi - panggilan manja untuk anak perempuan/gelaran anak mami penang\n",
    "Contiang - conteng\n",
    "Cucoq/coq - cucur (cth:coq pisang/coq badak)\n",
    "Cunggit - baju yg senget sebelah / lari selamatkan diri\n",
    "Darai - pondan\n",
    "Daun karipole - daun kari\n",
    "Demah - pukul/bagi tengah belakang\n",
    "Depa - mereka / diaorang\n",
    "Gagau - curi / ambil\n",
    "Gerek - basikal (Perlis)\n",
    "Ghabat - panjat\n",
    "Ghaplah jingga - buat perangai/bagi pening kepala\n",
    "Ghelaih - bilas/basuh/lap\n",
    "Gherit - baju koyak kena gigit tikus (Kedah)\n",
    "Ghiang - kederat aka tenaga\n",
    "Gon - tempat curam skit atau curam\n",
    "Gostan - undur ke belakang\n",
    "Gurmit - pengasah pensil\n",
    "Guruih api - mancis\n",
    "Habaq mai - bagitau aka beritahu\n",
    "Hamboq - cakap lepas/marah maki\n",
    "Haprak - apa pun tk boleh\n",
    "Haria - pukul atau untuk tag line team bola pulau pinang laungan semangat \"haria penang haria''\n",
    "Hambat - kejaq/halau\n",
    "Haluih - kecik sangat\n",
    "Hangpa - engkau/korang\n",
    "Hawaq - perangai tak elok/buas\n",
    "Hingaq - bising\n",
    "Jamban - tandas\n",
    "Jebon - seperti memek muka masam,sejenis musang\n",
    "Jerkah - tengking\n",
    "Jora - tunjuk hebat/dasyat\n",
    "Kalut - kelam kabut/kalang kelibut\n",
    "Kambee - alang rumah\n",
    "Kamjat - bengang/makian\n",
    "Kanyiaq - budak hingusan/budak2 baru nk 'up' hahaha\n",
    "Kapok - peluk\n",
    "Katok - ketuk/pukul (dgn batang kayu)\n",
    "Katup - tutup\n",
    "Kawaq - pencuri dlm kampung\n",
    "Kecek - pujuk/bodek (cth nk minta duit)\n",
    "Kedekut pait - kedekut haji bakhil\n",
    "Kalih - pusing tengok\n",
    "Kelepiaq - tepis / lepas basuh tangan kelepiaq ayaq\n",
    "Keloi - panggil (Kedah)\n",
    "Kemut - kedekut\n",
    "Keraih kedekiang - keras mcm batu\n",
    "Kerumun - mengelilingi seperti nak lihat sesuatu\n",
    "Ketegaq - degil / keraih kepala\n",
    "Koman - lekeh mcm takdak class\n",
    "Kona - pusing,belok \n",
    "Kongkiaq – sejenis semut kepala dan taring besar\n",
    "Kopek – kupas (buah) / beg duit lelaki\n",
    "Koyak minyak – pulas minyak\n",
    "Kore – banyak berkira\n",
    "Kutey – cubit\n",
    "Latam – pijak\n",
    "Lancha – beca\n",
    "Langgah – minum guna botol tak guna cawan (Kedah)\n",
    "Lehiaq – leher tapi sebut pelik skit\n",
    "Lekaih – cepat \n",
    "Lencun@basah lencun – basah sangat(mcm seluruh badan) *refer basah lokoih kat atas\n",
    "Lengai – lembab\n",
    "Ligan – kejar\n",
    "Likat – pekat\n",
    "Lingkup – jahanam/punah\n",
    "Lintaq – kilat/guruh\n",
    "Loncat – lompat\n",
    "Loqlaq – clumsy @ suka buat menda bukan-bukan (Kedah)\n",
    "Lunyai – sampai lembek (cth: belasah sampai ‘lunyai’)\n",
    "Maarop Sintok – maki hamun untuk org bodoh/bangang (cth:Maarop Sintok mana la yg buang smpah kt sini)\n",
    "Makan poloq – makan pelahap/gelojoh\n",
    "Mampoih – mampos\n",
    "Mami – mak/makcik\n",
    "Mamu – pakcik\n",
    "Mana aci – tak adil\n",
    "Mandom – lembap/tak aktif\n",
    "Manih melecaih – manis sgt2\n",
    "Mankaq – isi durian yg keras tuh/mcm tak tau apa2 lah\n",
    "Mapley – pengantin lelaki kalau kat penang (klau kat KL = mamak)\n",
    "Marka – awek\n",
    "Mastoura – set barang kemas termasuk rantai yang loket dia sambung ke rantai\n",
    "Meksi – jubah/dress\n",
    "Merah merengau – merah sangat terang\n",
    "Melahaq – makan gelojoh (Kedah)\n",
    "Meluat – benci/menyampah\n",
    "Melugai – meloya/perut tak selesa/ rasa macam nak muntah\n",
    "Meneghas – mengulangkaji @ mengulang baca Al Quran\n",
    "Meneghu-meneghang – kereta lalu lalang dengan sibuk sekali (Kedah)\n",
    "Mengkala – bila cth: mengkala depa nak sampai ni (Kedah)\n",
    "Meqhela – gedik (gatai menqhela)\n",
    "Merap – manja\n",
    "Merenyam – gatai miang jgk (Kedah:meqhenyam)\n",
    "Merelop – senja\n",
    "Meqeloh - tidoq\n",
    "Meghelit – berkilau-kilauan\n",
    "Meqoyak/ Meghoyak – merebak (cth:penyakit)\n",
    "Merton – hammer/penukul\n",
    "Morey – bubuq masjid\n",
    "Mot – tunggang\n",
    "Musibat – anak hantu/makian\n",
    "Mulut hebiaq - mulut takdak insurans\n",
    "Mengketedarah – makan tak ingat dunia/ lapaq gila (eg:awak hang ketedarah nasi ka)\n",
    "Nalla – besaq punya!\n",
    "Nana – abang/ panggilan untuk suami \n",
    "Nangoi – budak hingusan\n",
    "Nerai – sebaris (cth: Kedai tu satu nerai dengan Line Clear)\n",
    "Nyok – kelapa (ayaq nyok)\n",
    "Paket - poket baju\n",
    "Panchoq – tandas\n",
    "Pandel – khemah\n",
    "Parpu – kacang dal yg buat dalca tu (parpu ni bahasa melayu, tp rmai xtau sbb penang yg slalu pkai, dal/dalca tu bhs tamil)\n",
    "Pasemboq – rojak mamak kalau kat KL\n",
    "Pedajai – di perkenakan / terkena\n",
    "Pedoo – hati-hati/jaga\n",
    "Pelaq – buat terok/makian\n",
    "Pepak – mengunyah atau nak hancurkan gula-gula(atau benda keras) dlm mulut\n",
    "Perabih buang – makan sampai licin\n",
    "Perak/peghak – heran tengok benda tak pernah jumpa(jakun)\n",
    "Pokkay – sesumpah\n",
    "Ponen – pondan/bapok\n",
    "Ponu - pengantin perempuan \n",
    "Punggai – baling sesuatu\n",
    "Pungkoq – bontot\n",
    "Qragaih – panjat\n",
    "Reban/Gheban – runtuh (rumah itu reban) *bukan reban ayam tu\n",
    "Renjong/qhenjong – tinggi/panjang\n",
    "Renyeh/qhenyeh – senyum smpai ke telinga\n",
    "Reyau/qhiyau – menangis\n",
    "Ronda – jalan2 (jom ronda!)\n",
    "Rosyom – muka masam ja\n",
    "Qhengkong – kerengkong\n",
    "Sat – sekejap\n",
    "Sebek – muka sebek (muka sebelum nak mengangis)\n",
    "Segan – klau bahasa buku, segan tu maksudnya malu, tapi bagi orang Penang ‘malas’\n",
    "Seghiyau – takut\n",
    "Seghobeh – serabai\n",
    "Sekeh – luku kepala sipi2 (tak kuat sgt la)\n",
    "Selepong – bercapuk, bertampung\n",
    "Semawa – ajak/jemput\n",
    "Seneqih – calar (biasanya pada body kereta)\n",
    "Sengkek – pokai\n",
    "Siru – berlagak\n",
    "Solat lohoq – solat zuhur\n",
    "Soq – sahur\n",
    "Suap gula – betunang (atau adat semasa tunang)\n",
    "Taboh – bantai/pukul/belasah\n",
    "Tarantuk – beretuh kena dinding atau berlaga kepala\n",
    "Tak celuih – tak muat/tak cukup\n",
    "Takdan – tak sempat\n",
    "Tak kelapaq – tak mau/tak hingin\n",
    "Tamboq – lumpur/selut\n",
    "Tanjung – pergi bandar atau pekan untuk panggilan penang (Jom p tanjung nek lanca...hehe)\n",
    "Taqham – dendam simpan\n",
    "Tarak mau – tak mau\n",
    "Tauk – campak\n",
    "Tawaq – tawar atau tiada rasa\n",
    "Tenyeh – mengesat sesuatu eg: tenyeh mata\n",
    "Terjuntai - terkeluar dari sesuatu tempat (refer filem P Ramlee Masam Masam Manis)\n",
    "Terbei/teghebei – lastik(Kedah)\n",
    "Tergheliat –terseliuh\n",
    "Terpereluih – terperosok\n",
    "Tetomoih – digunakan untuk jatuh yang teruk (jatuh tetomoih)\n",
    "Toksah – jangan\n",
    "Tokua – tauhu\n",
    "Tok yu – kicap\n",
    "Tokak – gigit (tapi dengan bukak mulut besar2)\n",
    "Tonyoh – gosok/tenyeh dengan kuat\n",
    "Toya – perangai/makian\n",
    "Toya Mat Amin – perlian, kutukan (yang ni lawak, tak pasal2 mat amin mana tah kena)\n",
    "Tuntun – maknanya lebih kurang mcm tolak…sesuai digunakan utk basikal,motor\n",
    "Ubi mengala – ubi kentang\n",
    "Usha – survey/lihat (cth: jom ‘usha’ marka tu)\n",
    "Ya’apur seyre – trouble maker (rujuk Anak Mami movie)\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each line reads '<dialect word(s)> - <meaning>'; some lines use an en dash.\n",
    "for line in additional.split('\\n'):\n",
    "    # Ignore blank lines and lines with almost no usable text.\n",
    "    if len(cleaning(line)) < 3:\n",
    "        continue\n",
    "    # unidecode() turns the en dash into '-'. Cut at the first hyphen that\n",
    "    # touches whitespace, so hyphenated headwords such as\n",
    "    # 'Meneghu-meneghang' are kept whole instead of being truncated.\n",
    "    headword = re.split(r'\\s+-|-\\s+', unidecode(line), maxsplit=1)[0]\n",
    "    # Split alternatives BEFORE cleaning: cleaning() replaces '/' and '@' with\n",
    "    # spaces, which would fuse e.g. 'Camdek/candek' or 'Lencun@basah lencun'\n",
    "    # into one phrase. 'atau' is matched as a whole word only.\n",
    "    variants = re.split(r'/|@|\\batau\\b', headword, flags=re.IGNORECASE)\n",
    "    words.extend(v for v in map(cleaning, variants) if v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'acaq',\n",
       " 'aci ligan',\n",
       " 'acilut',\n",
       " 'agaq-agaq',\n",
       " 'ahia',\n",
       " 'ayaq',\n",
       " 'ayaq acaq',\n",
       " 'ayaq achaq',\n",
       " 'ayaq nyok',\n",
       " 'ayaq seqhebat',\n",
       " 'bakiaq',\n",
       " 'baloq liat',\n",
       " 'be-qha-chaq',\n",
       " 'be-qhemba',\n",
       " 'beghemba',\n",
       " 'beghetuh',\n",
       " 'belemoih',\n",
       " 'belen',\n",
       " 'belengaih',\n",
       " 'belutin',\n",
       " 'ber-ghetoh',\n",
       " 'berderemen',\n",
       " 'beretuh beghetuh',\n",
       " 'bergenuak',\n",
       " 'besaq gabai',\n",
       " 'beskat gheghek',\n",
       " 'biaq',\n",
       " 'bijik qhe-mia',\n",
       " 'biruu',\n",
       " 'buah ce-qhe-mai',\n",
       " 'buah qhe-yang-dot',\n",
       " 'calaq',\n",
       " 'camdek candek',\n",
       " 'cedoq',\n",
       " 'cekeding',\n",
       " 'celuih',\n",
       " 'cempeqha cempera',\n",
       " 'cempong',\n",
       " 'cengeh',\n",
       " 'cepung',\n",
       " 'ceqhoi',\n",
       " 'ceroi',\n",
       " 'cewi',\n",
       " 'cey-mey',\n",
       " 'chek',\n",
       " 'chemuih',\n",
       " 'cheq',\n",
       " 'chewi',\n",
       " 'chombi',\n",
       " 'ciwi',\n",
       " 'conek',\n",
       " 'contiang',\n",
       " 'cucoq coq',\n",
       " 'da-oh',\n",
       " 'dapoq',\n",
       " 'daun karipole',\n",
       " 'en-jos',\n",
       " 'ga-qhet',\n",
       " 'gabuih',\n",
       " 'ge-qhet',\n",
       " 'gey-qhek',\n",
       " 'ghabat',\n",
       " 'ghamei',\n",
       " 'ghaplah jingga',\n",
       " 'ghe-nyeh',\n",
       " 'ghelaih',\n",
       " 'gherit',\n",
       " 'ghiang',\n",
       " 'gomoi',\n",
       " 'gostan',\n",
       " 'gulong',\n",
       " 'gurmit',\n",
       " 'guruih api',\n",
       " 'ha-qhin',\n",
       " 'habaq',\n",
       " 'habaq mai',\n",
       " 'haluih',\n",
       " 'hamboq',\n",
       " 'handuh',\n",
       " 'hangpa',\n",
       " \"hapak pera'e\",\n",
       " 'haprak',\n",
       " 'hapuih',\n",
       " 'haria',\n",
       " 'hawaq',\n",
       " 'hawin',\n",
       " 'hawing',\n",
       " 'he-qhot petot',\n",
       " 'hebiaq',\n",
       " 'hemoi',\n",
       " 'herot petot',\n",
       " 'hijau me-qhe-ngak',\n",
       " 'hingaq',\n",
       " 'ikan ka-qhen',\n",
       " 'ikan temenung',\n",
       " 'je gheluih',\n",
       " 'je-qhang',\n",
       " 'jebon',\n",
       " 'jora',\n",
       " 'juruih',\n",
       " 'kahaq',\n",
       " 'kambee',\n",
       " 'kamjat',\n",
       " 'kampoi',\n",
       " 'kanyaq',\n",
       " 'kanyiaq',\n",
       " 'kapok',\n",
       " 'katok',\n",
       " 'kawaq',\n",
       " 'kebuloq',\n",
       " 'kedekut pait',\n",
       " 'kedeqhat',\n",
       " 'kekwat',\n",
       " 'keleboq',\n",
       " 'kelepiaq',\n",
       " 'keloi',\n",
       " 'kelolo',\n",
       " 'keraih kedekiang',\n",
       " 'kerin',\n",
       " 'ketegaq',\n",
       " 'ketit',\n",
       " 'koi-koi',\n",
       " 'kok-ko',\n",
       " 'kona',\n",
       " 'kongkiaq',\n",
       " 'kore',\n",
       " 'koro',\n",
       " 'koroi',\n",
       " 'koyak minyak',\n",
       " 'ku-tey',\n",
       " 'kueh dangai',\n",
       " 'kueh ka-qhas',\n",
       " 'kurang ajaq',\n",
       " 'kutey',\n",
       " 'lachuk',\n",
       " 'lagu mana',\n",
       " 'lancha',\n",
       " 'lapaq ayaq',\n",
       " 'lehiaq',\n",
       " 'lekaih',\n",
       " 'lencun basah lencun',\n",
       " 'ligan',\n",
       " 'lincun lencun',\n",
       " 'lintaq',\n",
       " 'lit-lit',\n",
       " 'log laq',\n",
       " 'lokoih',\n",
       " 'loqlaq',\n",
       " 'lumpoq',\n",
       " 'maarop sintok',\n",
       " 'makan poloq',\n",
       " 'mami',\n",
       " 'mampoih',\n",
       " 'mamu',\n",
       " 'mana aci',\n",
       " 'mandom',\n",
       " 'maneh melecaih',\n",
       " 'mang-kark',\n",
       " 'manih melecaih',\n",
       " 'mankaq',\n",
       " 'mapley',\n",
       " 'marka',\n",
       " 'mastoura',\n",
       " 'me-qhe-kah',\n",
       " 'me-qhe-nyeh',\n",
       " 'me-qhe-nyut',\n",
       " 'me-qhe-yup',\n",
       " 'me-qhela',\n",
       " 'me-qhelit me-qhelap',\n",
       " 'me-qho-not',\n",
       " 'megheben',\n",
       " 'meghelit',\n",
       " 'meksi',\n",
       " 'melahaq',\n",
       " 'melengung',\n",
       " 'melewaq',\n",
       " 'melilau',\n",
       " 'melo-qhoih',\n",
       " 'meluat',\n",
       " 'melugai',\n",
       " 'meneghas',\n",
       " 'meneghu',\n",
       " 'mengacom',\n",
       " 'mengkala',\n",
       " 'mengketedarah',\n",
       " 'menyebai',\n",
       " 'menyoronot',\n",
       " 'meqeloh',\n",
       " 'meqhela',\n",
       " 'meqoyak meghoyak',\n",
       " 'merah merengau',\n",
       " 'merap',\n",
       " 'merelop',\n",
       " 'merengit',\n",
       " 'merenyam',\n",
       " 'merton',\n",
       " 'morey',\n",
       " 'mulut hebiaq',\n",
       " 'nala',\n",
       " 'nalanya',\n",
       " 'nalla',\n",
       " 'nerai',\n",
       " 'ngenjot',\n",
       " 'nyeh',\n",
       " 'nyok',\n",
       " 'paket',\n",
       " 'panchoq',\n",
       " 'pandel',\n",
       " 'pasai pa',\n",
       " 'pasemboq',\n",
       " 'pe-daq',\n",
       " 'pe-qhembang',\n",
       " 'pe-qho-ngoih',\n",
       " 'pedai ayam',\n",
       " 'pedajai',\n",
       " 'pedo',\n",
       " 'pedoo',\n",
       " 'pelaq',\n",
       " 'pelekoh',\n",
       " 'peng-kaq',\n",
       " 'pengkaq',\n",
       " 'peqhasat',\n",
       " 'perabih buang',\n",
       " 'perak peghak',\n",
       " 'perenguih',\n",
       " 'peysos',\n",
       " 'pok kai',\n",
       " 'pokkay',\n",
       " 'ponen',\n",
       " 'ponu',\n",
       " 'pre-wel',\n",
       " 'pungkoq',\n",
       " 'puruih',\n",
       " 'qha-bat',\n",
       " 'qha-bit',\n",
       " 'qha-gaih',\n",
       " 'qha-let',\n",
       " 'qhamaih',\n",
       " 'qhan-dok',\n",
       " 'qhe-ling',\n",
       " 'qhe-ngau',\n",
       " 'qhe-tiaq',\n",
       " 'qhedah',\n",
       " 'qhem-pang',\n",
       " 'qhen-jong',\n",
       " 'qhengkong',\n",
       " 'qhim-bok',\n",
       " 'qhodong',\n",
       " 'qragaih',\n",
       " 'rabat',\n",
       " 'ragaih',\n",
       " 'reban gheban',\n",
       " 'renjong qhenjong',\n",
       " 'renyeh qhenyeh',\n",
       " 'reyau qhiyau',\n",
       " 'rosyom',\n",
       " 'sakit pala',\n",
       " 'se-gheyau',\n",
       " 'se-ghoh',\n",
       " 'se-qheyat',\n",
       " 'sebek',\n",
       " 'segheyat',\n",
       " 'seghiyau',\n",
       " 'seghobeh',\n",
       " 'selepong',\n",
       " 'seluloi',\n",
       " 'sendai',\n",
       " 'sene-qheh',\n",
       " 'seneghih',\n",
       " 'seneqih',\n",
       " 'serlum',\n",
       " 'simboq',\n",
       " 'singkiaq',\n",
       " 'siru',\n",
       " 'solat lohoq',\n",
       " 'suap gula',\n",
       " 'taboh',\n",
       " 'tahaq',\n",
       " 'tak celuih',\n",
       " 'tak dan',\n",
       " 'tak kelapaq',\n",
       " 'takdan',\n",
       " 'takek',\n",
       " 'tamboq',\n",
       " 'taqham',\n",
       " 'tarak mau',\n",
       " 'tarantuk',\n",
       " 'tauk',\n",
       " 'tawaq',\n",
       " 'te-qhe-bey',\n",
       " 'tembun',\n",
       " 'tengalong',\n",
       " 'terai',\n",
       " 'terbei teghebei',\n",
       " 'tere',\n",
       " 'terebey',\n",
       " 'terejai',\n",
       " 'tergelinciaq',\n",
       " 'tergheliat',\n",
       " 'terjuntai',\n",
       " 'terpe gheluih',\n",
       " 'terpelohong',\n",
       " 'terpereluih',\n",
       " \"terple'ot\",\n",
       " 'tertomoih',\n",
       " 'tetomoih',\n",
       " 'tey-liang',\n",
       " 'tibai',\n",
       " 'tidoq',\n",
       " 'to-qheh',\n",
       " 'tok yu',\n",
       " 'toksah',\n",
       " 'tokua',\n",
       " 'topui',\n",
       " 'toslet',\n",
       " 'toya mat amin',\n",
       " 'toyu',\n",
       " 'tungging buyung',\n",
       " 'ubi mengala',\n",
       " 'usha',\n",
       " \"ya'apur seyre\"}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep entries longer than three characters, de-duplicate them, and remove\n",
    "# everything that is already present in `malays`.\n",
    "words = set(filter(lambda entry: len(entry) > 3, words)) - malays\n",
    "words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "321"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct entries left after the length and `malays` filtering.\n",
    "len(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Write the entries in sorted order: iterating a set of strings gives an\n",
    "# order that can change between interpreter runs, which would make the\n",
    "# output file differ on every re-run of the notebook.\n",
    "with open('kedah-words.json', 'w') as fopen:\n",
    "    json.dump(sorted(words), fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
